# Load the WHO country-level dataset (194 countries, 13 variables; see the
# str() transcript below) and inspect its structure.
WHO <- read.csv("WHO.csv")
str(WHO)
## 'data.frame': 194 obs. of 13 variables:
## $ Country : Factor w/ 194 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Region : Factor w/ 6 levels "Africa","Americas",..: 3 4 1 4 1 2 2 4 6 4 ...
## $ Population : int 29825 3162 38482 78 20821 89 41087 2969 23050 8464 ...
## $ Under15 : num 47.4 21.3 27.4 15.2 47.6 ...
## $ Over60 : num 3.82 14.93 7.17 22.86 3.84 ...
## $ FertilityRate : num 5.4 1.75 2.83 NA 6.1 2.12 2.2 1.74 1.89 1.44 ...
## $ LifeExpectancy : int 60 74 73 82 51 75 76 71 82 81 ...
## $ ChildMortality : num 98.5 16.7 20 3.2 163.5 ...
## $ CellularSubscribers : num 54.3 96.4 99 75.5 48.4 ...
## $ LiteracyRate : num NA NA NA NA 70.1 99 97.8 99.6 NA NA ...
## $ GNI : num 1140 8820 8310 NA 5230 ...
## $ PrimarySchoolEnrollmentMale : num NA NA 98.2 78.4 93.1 91.1 NA NA 96.9 NA ...
## $ PrimarySchoolEnrollmentFemale: num NA NA 96.4 79.4 78.2 84.5 NA NA 97.5 NA ...
# --- Scatterplots of GNI vs. fertility rate: base R first, then ggplot2 ---
plot(WHO$GNI, WHO$FertilityRate)
library(ggplot2)
# Store the base aesthetic mapping once; point layers are added to it below.
scatterplot = ggplot(WHO, aes(x = GNI, y = FertilityRate))
scatterplot + geom_point()
## Warning: Removed 35 rows containing missing values (geom_point).
# The "Removed 35 rows" warnings come from countries with NA in GNI or
# FertilityRate; ggplot drops them silently apart from the warning.
scatterplot + geom_point(color = "blue", size = 3, shape=17)
## Warning: Removed 35 rows containing missing values (geom_point).
fertilityGNIplot = scatterplot + geom_point(color = "darkred", size = 3, shape=8) + ggtitle("Fertility rate vs. GNI")
# Save the plot to a PDF file; print() is needed to render a stored ggplot
# object onto the open device.
pdf("Myplot.pdf")
print(fertilityGNIplot)
## Warning: Removed 35 rows containing missing values (geom_point).
# Close the PDF device and return output to the screen.
dev.off()
## quartz_off_screen
## 2
scatterplot + geom_point(color = "blue", size = 3, shape=15)
## Warning: Removed 35 rows containing missing values (geom_point).
# Color by a factor (Region) gives a discrete legend ...
ggplot(WHO,aes(x=GNI,y=FertilityRate,color=Region))+geom_point()
## Warning: Removed 35 rows containing missing values (geom_point).
# ... while coloring by a numeric variable gives a continuous gradient.
ggplot(WHO,aes(x=GNI,y=FertilityRate,color=LifeExpectancy))+geom_point()
## Warning: Removed 35 rows containing missing values (geom_point).
ggplot(WHO,aes(x=FertilityRate,y=Under15))+geom_point()
## Warning: Removed 11 rows containing missing values (geom_point).
# The log transform of FertilityRate linearizes the relationship; overlay a
# least-squares fit without its confidence band.
ggplot(WHO,aes(x=log(FertilityRate),y=Under15))+geom_point() + stat_smooth(method="lm",se=FALSE,color="orange")
## Warning: Removed 11 rows containing missing values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).
# Regress Under15 on log(FertilityRate); per the summary transcript below the
# fit is strong (R-squared ~0.94, 11 rows dropped for missingness).
model = lm(Under15~log(FertilityRate),data = WHO)
summary(model)
##
## Call:
## lm(formula = Under15 ~ log(FertilityRate), data = WHO)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.3131 -1.7742 0.0446 1.7440 7.7174
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6540 0.4478 17.09 <2e-16 ***
## log(FertilityRate) 22.0547 0.4175 52.82 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 181 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.9391, Adjusted R-squared: 0.9387
## F-statistic: 2790 on 1 and 181 DF, p-value: < 2.2e-16
# Color points by Region using a ColorBrewer qualitative palette.
ggplot(WHO, aes(x = FertilityRate, y = Under15, color=Region)) + geom_point()+scale_color_brewer(palette="Dark2")
## Warning: Removed 11 rows containing missing values (geom_point).
# --- Chicago motor vehicle thefts: load data, derive Weekday and Hour ---
# Fix: spell the argument out as stringsAsFactors; the original
# "stringsAsFactor" only worked through R's partial argument matching.
mvt = read.csv("mvt.csv",stringsAsFactors = FALSE)
str(mvt)
## 'data.frame': 191641 obs. of 3 variables:
## $ Date : chr "12/31/12 23:15" "12/31/12 22:00" "12/31/12 22:00" "12/31/12 22:00" ...
## $ Latitude : num 41.8 41.9 42 41.8 41.8 ...
## $ Longitude: num -87.6 -87.7 -87.8 -87.7 -87.6 ...
# strptime() returns POSIXlt, whose $hour component is extracted below.
mvt$Date = strptime(mvt$Date,format="%m/%d/%y %H:%M")
# NOTE(review): weekdays() labels are locale-dependent; the English day names
# in the tables below assume an English locale.
mvt$Weekday = weekdays(mvt$Date)
mvt$Hour = mvt$Date$hour
table(mvt$Weekday)
##
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## 29284 27397 27118 26316 27319 26791 27416
WeekdayCounts = as.data.frame(table(mvt$Weekday))
str(WeekdayCounts)
## 'data.frame': 7 obs. of 2 variables:
## $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7
## $ Freq: int 29284 27397 27118 26316 27319 26791 27416
library(ggplot2)
# lineplot
# Reorder the day factor so the x-axis runs Sunday through Saturday instead
# of alphabetically.
WeekdayCounts$Var1 = factor(WeekdayCounts$Var1,ordered = TRUE,levels=c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"))
# group=1 tells geom_line to connect all seven points as one series.
ggplot(WeekdayCounts,aes(x=Var1,y=Freq))+geom_line(aes(group=1),linetype = 2)+xlab("Day of the Week")+ylab("Total Motorvehicle Thefts")
ggplot(WeekdayCounts,aes(x=Var1,y=Freq))+geom_line(aes(group=1),alpha = 0.3)+xlab("Day of the Week")+ylab("Total Motorvehicle Thefts")
# heat map
# Cross-tabulate theft counts by weekday and hour of day (the transcript
# below prints the 7 x 24 table in two panels of 12 hours each).
table(mvt$Weekday,mvt$Hour)
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## Friday 1873 932 743 560 473 602 839 1203 1268 1286 938 822
## Monday 1900 825 712 527 415 542 772 1123 1323 1235 971 737
## Saturday 2050 1267 985 836 652 508 541 650 858 1039 946 789
## Sunday 2028 1236 1019 838 607 461 478 483 615 864 884 787
## Thursday 1856 816 696 508 400 534 799 1135 1298 1301 932 731
## Tuesday 1691 777 603 464 414 520 845 1118 1175 1174 948 786
## Wednesday 1814 790 619 469 396 561 862 1140 1329 1237 947 763
##
## 12 13 14 15 16 17 18 19 20 21 22 23
## Friday 1207 857 937 1140 1165 1318 1623 1652 1736 1881 2308 1921
## Monday 1129 824 958 1059 1136 1252 1518 1503 1622 1815 2009 1490
## Saturday 1204 767 963 1086 1055 1084 1348 1390 1570 1702 2078 1750
## Sunday 1192 789 959 1037 1083 1160 1389 1342 1706 1696 2079 1584
## Thursday 1093 752 831 1044 1131 1258 1510 1537 1668 1776 2134 1579
## Tuesday 1108 762 908 1071 1090 1274 1553 1496 1696 1816 2044 1458
## Wednesday 1225 804 863 1075 1076 1289 1580 1507 1718 1748 2093 1511
DayHourCounts = as.data.frame(table(mvt$Weekday,mvt$Hour))
str(DayHourCounts)
## 'data.frame': 168 obs. of 3 variables:
## $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7 1 2 3 ...
## $ Var2: Factor w/ 24 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ Freq: int 1873 1900 2050 2028 1856 1691 1814 932 825 1267 ...
# Var2 is a factor of hour labels; go via as.character to recover the numeric
# hour (as.numeric on the factor directly would return level indices).
DayHourCounts$Hour = as.numeric(as.character(DayHourCounts$Var2))
str(DayHourCounts)
## 'data.frame': 168 obs. of 4 variables:
## $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7 1 2 3 ...
## $ Var2: Factor w/ 24 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ Freq: int 1873 1900 2050 2028 1856 1691 1814 932 825 1267 ...
## $ Hour: num 0 0 0 0 0 0 0 1 1 1 ...
# One line per weekday across the 24 hours.
ggplot(DayHourCounts,aes(x=Hour,y=Freq))+geom_line(aes(group=Var1,color=Var1),size=2)
# Order the days Monday..Sunday for the heat map's y-axis.
DayHourCounts$Var1 = factor(DayHourCounts$Var1,ordered=TRUE, levels=c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"))
# Heat map: tile fill encodes the theft count; the y-axis title is hidden.
ggplot(DayHourCounts,aes(x = Hour,y=Var1))+geom_tile(aes(fill=Freq))+scale_fill_gradient(name="Total MV Theft",low="white",high="red") + theme(axis.title.y = element_blank())
#install.packages("maps")
#install.packages("ggmap")
library(maps)
library(ggmap)
# NOTE(review): current ggmap versions require a registered Google Maps API
# key for get_map(); this call ran unauthenticated when the transcript was
# recorded.
chicago = get_map(location="chicago",zoom=11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=chicago&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=chicago&sensor=false
# Overlay the first 100 thefts on the Chicago map.
ggmap(chicago) + geom_point(data = mvt[1:100,],aes(x=Longitude,y=Latitude))
## Warning: Removed 7 rows containing missing values (geom_point).
# Bin thefts into a grid by rounding lat/long to 2 decimal places and counting
# incidents per cell.
LatLonCounts = as.data.frame(table(round(mvt$Longitude,2),round(mvt$Latitude,2)))
str(LatLonCounts)
## 'data.frame': 1638 obs. of 3 variables:
## $ Var1: Factor w/ 42 levels "-87.93","-87.92",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Var2: Factor w/ 39 levels "41.64","41.65",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Freq: int 0 0 0 0 0 0 0 0 0 0 ...
#convert factor to numeric
LatLonCounts$Long = as.numeric(as.character(LatLonCounts$Var1))
LatLonCounts$Lat = as.numeric(as.character(LatLonCounts$Var2))
ggmap(chicago) + geom_point(data = LatLonCounts, aes(x = Long, y = Lat, color = Freq, size=Freq)) + scale_colour_gradient(low="yellow", high="red")
## Warning: Removed 615 rows containing missing values (geom_point).
ggmap(chicago) + geom_tile(data = LatLonCounts,aes(x=Long,y=Lat,alpha = Freq),fill = "red")
# Drop empty grid cells so zero-count tiles/points are not drawn over the map.
LatLonCounts2 = subset(LatLonCounts,Freq>0)
ggmap(chicago) + geom_tile(data = LatLonCounts2,aes(x=Long,y=Lat,alpha = Freq),fill = "red")
ggmap(chicago) + geom_point(data = LatLonCounts2, aes(x = Long, y = Lat, color = Freq, size=Freq)) + scale_colour_gradient(low="yellow", high="red")
## Warning: Removed 119 rows containing missing values (geom_point).
# --- US murders choropleth: load state-level data ---
murders = read.csv("murders.csv")
str(murders)
## 'data.frame': 51 obs. of 6 variables:
## $ State : Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Population : int 4779736 710231 6392017 2915918 37253956 5029196 3574097 897934 601723 19687653 ...
## $ PopulationDensity: num 94.65 1.26 57.05 56.43 244.2 ...
## $ Murders : int 199 31 352 130 1811 117 131 48 131 987 ...
## $ GunMurders : int 135 19 232 93 1257 65 97 38 99 669 ...
## $ GunOwnership : num 0.517 0.578 0.311 0.553 0.213 0.347 0.167 0.255 0.036 0.245 ...
# map_data() returns state outlines as ordered polygon vertices; "group"
# identifies each polygon so borders are not drawn across separate shapes.
statesMap=map_data("state")
str(statesMap)
## 'data.frame': 15537 obs. of 6 variables:
## $ long : num -87.5 -87.5 -87.5 -87.5 -87.6 ...
## $ lat : num 30.4 30.4 30.4 30.3 30.3 ...
## $ group : num 1 1 1 1 1 1 1 1 1 1 ...
## $ order : int 1 2 3 4 5 6 7 8 9 10 ...
## $ region : chr "alabama" "alabama" "alabama" "alabama" ...
## $ subregion: chr NA NA NA NA ...
ggplot(statesMap,aes(x=long, y = lat, group=group)) +geom_polygon(fill = "white",color = "black")
# map_data regions are lowercase, so lowercase the state names before merging
# the murder statistics onto every polygon vertex of each state.
murders$region=tolower(murders$State)
murderMap = merge(statesMap,murders,by ="region")
str(murderMap)
## 'data.frame': 15537 obs. of 12 variables:
## $ region : chr "alabama" "alabama" "alabama" "alabama" ...
## $ long : num -87.5 -87.5 -87.5 -87.5 -87.6 ...
## $ lat : num 30.4 30.4 30.4 30.3 30.3 ...
## $ group : num 1 1 1 1 1 1 1 1 1 1 ...
## $ order : int 1 2 3 4 5 6 7 8 9 10 ...
## $ subregion : chr NA NA NA NA ...
## $ State : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Population : int 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 ...
## $ PopulationDensity: num 94.7 94.7 94.7 94.7 94.7 ...
## $ Murders : int 199 199 199 199 199 199 199 199 199 199 ...
## $ GunMurders : int 135 135 135 135 135 135 135 135 135 135 ...
## $ GunOwnership : num 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 ...
# Fill states by raw murder counts — note this largely mirrors population
# size (compare the next plot).
ggplot(murderMap,aes(x=long,y=lat,group=group,fill=Murders))+geom_polygon(color="black")+scale_fill_gradient(low="black",high="red",guide="legend")
ggplot(murderMap,aes(x=long,y=lat,group=group,fill=Population))+geom_polygon(color="black")+scale_fill_gradient(low="black",high="red",guide="legend")
# Murder rate per 100,000 inhabitants; limits=c(0,10) caps the color scale so
# a single outlying high-rate region does not compress the gradient.
murderMap$MurderRate = murderMap$Murders/murderMap$Population*100000
ggplot(murderMap,aes(x=long,y=lat,group=group,fill=MurderRate))+geom_polygon(color="black")+scale_fill_gradient(low="black",high="red",guide="legend",limits=c(0,10))
#fill each state with the variable GunOwnership
# Fix: GunOwnership is already a proportion of the population (e.g. 0.517 —
# see the str(murders) transcript above), so dividing it by Population again
# is meaningless and produces vanishingly small, population-driven values.
# Map the variable directly instead.
ggplot(murderMap,aes(x=long,y=lat,group=group,fill=GunOwnership))+geom_polygon(color="black")+scale_fill_gradient(low="black",high="red",guide="legend")
In the recitation from Unit 3, we used logistic regression on polling data in order to construct US presidential election predictions. We separated our data into a training set, containing data from 2004 and 2008 polls, and a test set, containing the data from 2012 polls. We then proceeded to develop a logistic regression model to forecast the 2012 US presidential election.
In this homework problem, we’ll revisit our logistic regression model from Unit 3, and learn how to plot the output on a map of the United States. Unlike what we did in the Crime lecture, this time we’ll be plotting predictions rather than data!

### Drawing a map of the United States
# --- Homework: drawing a map of the United States ---
library(ggplot2)
library(maps)
library(ggmap)
statesMap = map_data("state")
str(statesMap)
## 'data.frame': 15537 obs. of 6 variables:
## $ long : num -87.5 -87.5 -87.5 -87.5 -87.6 ...
## $ lat : num 30.4 30.4 30.4 30.3 30.3 ...
## $ group : num 1 1 1 1 1 1 1 1 1 1 ...
## $ order : int 1 2 3 4 5 6 7 8 9 10 ...
## $ region : chr "alabama" "alabama" "alabama" "alabama" ...
## $ subregion: chr NA NA NA NA ...
#How many different groups are there?
# Each group is one polygon; the 63 groups exceed the 49 mapped regions
# because some states consist of several polygons (islands, peninsulas).
table(statesMap$group)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 202 149 312 516 79 91 94 10 872 381 233 329 257 256 113
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 397 650 399 566 36 220 30 460 370 373 382 315 238 208 70
## 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
## 125 205 78 16 290 21 168 37 733 12 105 238 284 236 172
## 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 66 304 166 289 1088 59 129 96 15 623 17 17 19 44 448
## 61 62 63
## 373 388 68
ggplot(statesMap, aes(x = long, y = lat, group = group)) + geom_polygon(fill = "white", color = "black")
### Coloring the States by Predictions
polling<-read.csv("PollingImputed.csv")
str(polling)
## 'data.frame': 145 obs. of 7 variables:
## $ State : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 2 2 3 3 3 4 4 4 ...
## $ Year : int 2004 2008 2004 2008 2004 2008 2012 2004 2008 2012 ...
## $ Rasmussen : int 11 21 19 16 5 5 8 7 10 13 ...
## $ SurveyUSA : int 18 25 21 18 15 3 5 5 7 21 ...
## $ DiffCount : int 5 5 1 6 8 9 4 8 5 2 ...
## $ PropR : num 1 1 1 1 1 ...
## $ Republican: int 1 1 1 1 1 1 1 1 1 1 ...
# Training set: 2004 and 2008 polls; test set: 2012 polls.
Train <- subset(polling,Year == 2004|Year ==2008 )
Test <- subset(polling,Year ==2012)
# Sanity check: the two subsets partition the full dataset.
nrow(Train)+nrow(Test)== nrow(polling)
## [1] TRUE
# Logistic regression: predict the Republican outcome from SurveyUSA and
# DiffCount (model from the Unit 3 recitation).
mod2 = glm(Republican~SurveyUSA + DiffCount, data = Train, family = "binomial")
TestPrediction = predict(mod2,newdata = Test, type = "response")
# Binary prediction: 1 (Republican) when predicted probability exceeds 0.5.
TestPredictionBinary = as.numeric(TestPrediction>0.5)
predictionDataFrame= data.frame(TestPrediction,TestPredictionBinary,Test$State)
str(predictionDataFrame)
## 'data.frame': 45 obs. of 3 variables:
## $ TestPrediction : num 9.74e-01 9.99e-01 9.26e-05 9.43e-03 3.43e-05 ...
## $ TestPredictionBinary: num 1 1 0 0 0 1 1 0 1 0 ...
## $ Test.State : Factor w/ 50 levels "Alabama","Alaska",..: 3 4 5 6 7 9 10 11 12 13 ...
#For how many states is our binary prediction 1 (for 2012), corresponding to Republican?
table(predictionDataFrame$TestPredictionBinary)
##
## 0 1
## 23 22
#What is the average predicted probability of our model (on the Test set, for 2012)?
summary(predictionDataFrame$TestPrediction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000001 0.0000926 0.0648700 0.4853000 0.9986000 0.9999000
# Lowercase state names to match map_data's region key, then merge the
# predictions onto the polygon vertices.
predictionDataFrame$region = tolower(predictionDataFrame$Test.State)
predictionMap <- merge(statesMap,predictionDataFrame,by="region")
#make sure the observations are in order so that the map is drawn properly,
predictionMap <- predictionMap[order(predictionMap$order),]
str(predictionMap)
## 'data.frame': 15034 obs. of 9 variables:
## $ region : chr "arizona" "arizona" "arizona" "arizona" ...
## $ long : num -115 -115 -115 -115 -115 ...
## $ lat : num 35 35.1 35.1 35.2 35.2 ...
## $ group : num 2 2 2 2 2 2 2 2 2 2 ...
## $ order : int 204 205 206 207 208 209 210 211 212 213 ...
## $ subregion : chr NA NA NA NA ...
## $ TestPrediction : num 0.974 0.974 0.974 0.974 0.974 ...
## $ TestPredictionBinary: num 1 1 1 1 1 1 1 1 1 1 ...
## $ Test.State : Factor w/ 50 levels "Alabama","Alaska",..: 3 3 3 3 3 3 3 3 3 3 ...
str(statesMap) # only make prediction for 45 states, so there are less observation in the predictionMap than statesMap
## 'data.frame': 15537 obs. of 6 variables:
## $ long : num -87.5 -87.5 -87.5 -87.5 -87.6 ...
## $ lat : num 30.4 30.4 30.4 30.3 30.3 ...
## $ group : num 1 1 1 1 1 1 1 1 1 1 ...
## $ order : int 1 2 3 4 5 6 7 8 9 10 ...
## $ region : chr "alabama" "alabama" "alabama" "alabama" ...
## $ subregion: chr NA NA NA NA ...
# color the map with predictions
ggplot(predictionMap,aes(x = long, y = lat, group = group, fill = TestPredictionBinary )) + geom_polygon(color="black")
# replot with discrete outcome and use red (Republican) and blue (Democrat)
ggplot(predictionMap,aes(x = long, y = lat, group = group, fill = TestPredictionBinary )) + geom_polygon(color="black") + scale_fill_gradient(low = "blue", high = "red", guide = "legend", breaks= c(0,1), labels = c("Democrat", "Republican"), name = "Prediction 2012")
# plot the probabilities instead of the binary predictions.
ggplot(predictionMap,aes(x = long, y = lat, group = group, fill = TestPrediction )) + geom_polygon(color="black") + scale_fill_gradient(low = "blue", high = "red", guide = "legend", breaks= c(0,1), labels = c("Democrat", "Republican"), name = "Prediction 2012")
# What was our predicted probability for the state of Florida (that was incorrectly predicted to be Republican)
predictionDataFrame$TestPrediction[predictionDataFrame$region =="florida"]
## [1] 0.9640395
# Variations on the polygon border: dotted outline, thick outline, and
# semi-transparent fill.
ggplot(predictionMap,aes(x = long, y = lat, group = group, fill = TestPredictionBinary )) + geom_polygon(color="black",linetype=3) + scale_fill_gradient(low = "blue", high = "red", guide = "legend", breaks= c(0,1), labels = c("Democrat", "Republican"), name = "Prediction 2012")
ggplot(predictionMap,aes(x = long, y = lat, group = group, fill = TestPredictionBinary )) + geom_polygon(color="black",size=3) + scale_fill_gradient(low = "blue", high = "red", guide = "legend", breaks= c(0,1), labels = c("Democrat", "Republican"), name = "Prediction 2012")
ggplot(predictionMap,aes(x = long, y = lat, group = group, fill = TestPredictionBinary )) + geom_polygon(color="black",alpha=0.3) + scale_fill_gradient(low = "blue", high = "red", guide = "legend", breaks= c(0,1), labels = c("Democrat", "Republican"), name = "Prediction 2012")
The cliché goes that the world is an increasingly interconnected place, and the connections between different entities are often best represented with a graph. Graphs are composed of vertices (also often called “nodes”) and edges connecting those nodes. In this assignment, we will learn how to visualize networks using the igraph package in R.
For this assignment, we will visualize social networking data using anonymized data from Facebook; this data was originally curated in a recent paper about computing social circles in social networks. In our visualizations, the vertices in our network will represent Facebook users and the edges will represent these users being Facebook friends with each other.
# --- Facebook social network: load edges/users and build the graph ---
edges<- read.csv("edges.csv")
users<-read.csv("users.csv")
str(users)
## 'data.frame': 59 obs. of 4 variables:
## $ id : int 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 ...
## $ gender: Factor w/ 3 levels "","A","B": 2 3 3 3 3 3 2 3 3 2 ...
## $ school: Factor w/ 3 levels "","A","AB": 2 1 1 1 1 2 1 1 2 1 ...
## $ locale: Factor w/ 3 levels "","A","B": 3 3 3 3 3 3 2 3 3 2 ...
str(edges)
## 'data.frame': 146 obs. of 2 variables:
## $ V1: int 4019 4023 4023 4027 3988 3982 3994 3998 3993 3982 ...
## $ V2: int 4026 4031 4030 4032 4021 3986 3998 3999 3995 4021 ...
#How many Facebook users are there in our dataset?
nrow(users)
## [1] 59
#what is the average number of friends per user?
# NOTE(review): this per-user friend-count vector was transcribed by hand;
# the computed avgFriends below (2*|edges|/|users| = 292/59) is the reliable
# figure and agrees with the 4.949153 printed here.
friends = c(7,13,1,0,5,8,1,6,5,3,2,2,5,10,8,3,3,10,13,3,8,1,6,4,9,2,1,3,0,9,0,3,1,5,11,0,3,8,6,7,7,10,0,17,0,3,8,6,1,1,18,10,1,2,1,0,1,3,8)
sum(friends)/nrow(users)
## [1] 4.949153
# Each undirected edge contributes two friendships, hence the factor of 2.
avgFriends = nrow(edges)*2 / nrow(users)
#Out of all the students who listed a school, what was the most common locale?
summary(users)
## id gender school locale
## Min. : 594 : 2 :40 : 3
## 1st Qu.:3994 A:15 A :17 A: 6
## Median :4009 B:42 AB: 2 B:50
## Mean :3952
## 3rd Qu.:4024
## Max. :4038
#Is it possible that either school A or B is an all-girls or all-boys school?
#install.packages("igraph")
library(igraph)
# FALSE = undirected graph; the users data frame supplies vertex attributes
# (gender, school, locale) keyed by id.
g = graph.data.frame(edges,FALSE,users)
str(g)
## IGRAPH UN-- 59 146 --
## + attr: name (v/c), gender (v/c), school (v/c), locale (v/c)
## + edges (vertex names):
## 3981 -- 3994, 3997, 3998, 4009, 4018, 4019, 4023
## 3982 -- 3986, 3988, 3994, 3997, 3998, 4003, 4009, 4014, 4021,
## 4023, 4026, 4030, 4037
## 3983 -- 4017
## 3984 --
## 3985 -- 3988, 3993, 3995, 4004, 4014
## 3986 -- 3982, 4000, 4014, 4017, 4021, 4026, 4030, 4033
## 3987 -- 4012
## 3988 -- 3982, 3985, 3993, 3995, 4021, 4030
## 3989 -- 3991, 594, 4011, 4013, 4038
## 3990 -- 4007, 4016, 4025
## 3991 -- 3989, 4031
## 3992 -- 4000, 4017
## 3993 -- 3985, 3988, 3995, 4004, 4030
## 3994 -- 3981, 3982, 3996, 3997, 3998, 4009, 4018, 4019, 4023, 4030
## 3995 -- 3985, 3988, 3993, 4000, 4004, 4014, 4023, 4026
## 594 -- 3989, 4011, 4031
## 3996 -- 3994, 4002, 4028
## 3997 -- 3981, 3982, 3994, 3998, 4009, 4018, 4019, 4021, 4023, 4030
## 3998 -- 3981, 3982, 3994, 3997, 3999, 4002, 4005, 4009, 4014,
## 4018, 4019, 4021, 4023
## 3999 -- 3998, 4005, 4036
## 4000 -- 3986, 3992, 3995, 4017, 4021, 4026, 4030, 4033
## 4001 -- 4029
## 4002 -- 3996, 3998, 4020, 4023, 4027, 4031
## 4003 -- 3982, 4009, 4023, 4030
## 4004 -- 3985, 3993, 3995, 4013, 4020, 4023, 4030, 4031, 4038
## 4005 -- 3998, 3999
## 4006 -- 4027
## 4007 -- 3990, 4016, 4025
## 4008 --
## 4009 -- 3981, 3982, 3994, 3997, 3998, 4003, 4019, 4023, 4030
## 4010 --
## 4011 -- 3989, 594, 4031
## 4012 -- 3987
## 4013 -- 3989, 4004, 4023, 4031, 4038
## 4014 -- 3982, 3985, 3986, 3995, 3998, 4017, 4021, 4023, 4030,
## 4037, 4038
## 4015 --
## 4016 -- 3990, 4007, 4025
## 4017 -- 3983, 3986, 3992, 4000, 4014, 4021, 4026, 4030
## 4018 -- 3981, 3994, 3997, 3998, 4023, 4030
## 4019 -- 3981, 3994, 3997, 3998, 4009, 4026, 4030
## 4020 -- 4002, 4004, 4027, 4030, 4031, 4037, 4038
## 4021 -- 3982, 3986, 3988, 3997, 3998, 4000, 4014, 4017, 4026, 4030
## 4022 --
## 4023 -- 3981, 3982, 3994, 3995, 3997, 3998, 4002, 4003, 4004,
## 4009, 4013, 4014, 4018, 4030, 4031, 4034, 4038
## 4024 --
## 4025 -- 3990, 4007, 4016
## 4026 -- 3982, 3986, 3995, 4000, 4017, 4019, 4021, 4030
## 4027 -- 4002, 4006, 4020, 4031, 4032, 4038
## 4028 -- 3996
## 4029 -- 4001
## 4030 -- 3982, 3986, 3988, 3993, 3994, 3997, 4000, 4003, 4004,
## 4009, 4014, 4017, 4018, 4019, 4020, 4021, 4023, 4026
## 4031 -- 3991, 594, 4002, 4004, 4011, 4013, 4020, 4023, 4027, 4038
## 4032 -- 4027
## 4033 -- 3986, 4000
## 4034 -- 4023
## 4035 --
## 4036 -- 3999
## 4037 -- 3982, 4014, 4020
## 4038 -- 3989, 4004, 4013, 4014, 4020, 4023, 4027, 4031
#get.data.frame(g, what=c("both"))
# vertex.label = NA suppresses the numeric id labels on every plot below.
plot(g,vertex.size=5,vertex.label = NA)
# change size of the vertices
# Scale each vertex's size with its degree so well-connected users stand out;
# the /2+2 keeps isolated users (degree 0) visible at size 2.
V(g)$size=degree(g)/2+2
plot(g,vertex.label=NA)
V(g)$size
## [1] 5.5 8.5 2.5 2.0 4.5 6.0 2.5 5.0 4.5 3.5 3.0 3.0 4.5 7.0
## [15] 6.0 3.5 3.5 7.0 8.5 3.5 6.0 2.5 5.0 4.0 6.5 3.0 2.5 3.5
## [29] 2.0 6.5 2.0 3.5 2.5 4.5 7.5 2.0 3.5 6.0 5.0 5.5 5.5 7.0
## [43] 2.0 10.5 2.0 3.5 6.0 5.0 2.5 2.5 11.0 7.0 2.5 3.0 2.5 2.0
## [57] 2.5 3.5 6.0
summary(V(g)$size)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.500 3.500 4.475 6.000 11.000
#When we created our graph g, we provided it with the data frame users, which had variables gender, school, and locale. These are now stored as attributes V(g)$gender, V(g)$school, and V(g)$locale.
# Now color the vertices based on the gender of the user,
# Users with a blank attribute value keep the default black color in each of
# the three colorings below.
V(g)$color = "black"
V(g)$color[V(g)$gender == "A"] = "red"
V(g)$color[V(g)$gender=="B"] = "grey"
plot(g,vertex.label = NA)
#Now, color the vertices based on the school that each user in our network attended.
V(g)$color = "black"
V(g)$color[V(g)$school == "A"] = "blue"
V(g)$color[V(g)$school == "AB"] = "red"
plot(g,vertex.label = NA)
#Now, color the vertices based on the locale of the user.
V(g)$color = "black"
V(g)$color[V(g)$locale == "A"] = "blue"
V(g)$color[V(g)$locale == "B"] = "red"
plot(g,vertex.label = NA)
Earlier in the course, we used text analytics as a predictive tool, using word frequencies as independent variables in our models. However, sometimes our goal is to understand commonly occurring topics in text data rather than to predict the value of some dependent variable. In such cases, word clouds can be a visually appealing way to display the most frequent words in a body of text. A word cloud arranges the most common words in some text, using size to indicate the frequency of a word. While we could generate word clouds using free generators available on the Internet, we will have more flexibility and control over the process if we do so in R. We will visualize the text of tweets about Apple, a dataset we used earlier in the course.
# --- Apple tweets: build a corpus and document-term matrix ---
# Fix: spell the argument out as stringsAsFactors (the original relied on
# partial argument matching of "stringsAsFactor").
tweets<-read.csv("tweets.csv",stringsAsFactors = FALSE)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
# Fix: build the corpus from the tweet text column, not the whole data frame.
# VectorSource(tweets) treats each COLUMN of the data frame as one document,
# which is why the recorded output below shows only 2 documents.
# (Column name "Tweet" per the course dataset — confirm against str(tweets).)
corpus <- Corpus(VectorSource(tweets$Tweet))
# Fix: wrap tolower in content_transformer() so tm_map keeps returning proper
# text documents; this also makes the PlainTextDocument repair step used by
# the original script unnecessary.
corpus <- tm_map(corpus,content_transformer(tolower))
corpus <- tm_map(corpus,removePunctuation)
corpus <- tm_map(corpus,removeWords,stopwords("english"))
dtm <- DocumentTermMatrix(corpus)
dtm
# NOTE(review): the transcript below reflects the original column-as-document
# corpus; with one document per tweet the document count equals the number of
# tweets, and term counts may differ slightly.
## <<DocumentTermMatrix (documents: 2, terms: 3780)>>
## Non-/sparse entries: 3780/3780
## Sparsity : 50%
## Maximal term length: 115
## Weighting : term frequency (tf)
# Densify the DTM into a data frame: one column per term, one row per
# document (fine at this vocabulary size).
allTweets <- as.data.frame(as.matrix(dtm))
str(allTweets)
## 'data.frame': 2 obs. of 3780 variables:
## $ 000 : num 1 0
## $ 075 : num 3 0
## $ 0909 : num 1 0
## $ 0910 : num 1 0
## $ 099 : num 1 0
## $ 100 : num 2 0
## $ 100m : num 1 0
## $ 1085 : num 3 0
## $ 10min : num 1 0
## $ 110 : num 1 0
## $ 13apple : num 1 0
## $ 13th : num 1 0
## $ 1415 : num 1 0
## $ 16gb : num 1 0
## $ 16gbs : num 1 0
## $ 180 : num 1 0
## $ 18092013 : num 1 0
## $ 18th : num 2 0
## $ 199 : num 3 0
## $ 1am : num 1 0
## $ 1jazzyjeff : num 1 0
## $ 1st : num 1 0
## $ 200 : num 2 0
## $ 2000ad : num 1 0
## $ 2001 : num 1 0
## $ 2002 : num 1 0
## $ 2004 : num 1 0
## $ 2005 : num 2 0
## $ 2011with : num 1 0
## $ 2013 : num 4 0
## $ 2014 : num 3 0
## $ 20th : num 4 0
## $ 211 : num 1 0
## $ 21st : num 1 0
## $ 22000 : num 1 0
## $ 22nd : num 1 0
## $ 244tsuyoponzu : num 6 0
## $ 2nd : num 2 0
## $ 2shaneez : num 1 0
## $ 2week : num 1 0
## $ 300 : num 2 0
## $ 30aud : num 1 0
## $ 30mins : num 1 0
## $ 320k : num 1 0
## $ 350 : num 1 0
## $ 3gs : num 1 0
## $ 3rd : num 1 0
## $ 3yr : num 1 0
## $ 40000 : num 1 0
## $ 40k : num 1 0
## $ 40mb : num 1 0
## $ 4eva : num 1 0
## $ 500 : num 1 0
## $ 528 : num 1 0
## $ 549 : num 2 0
## $ 550 : num 1 0
## $ 55cs : num 1 0
## $ 55mb : num 1 0
## $ 5cheap : num 2 0
## $ 5s5c : num 2 0
## $ 5same : num 1 0
## $ 5sc : num 2 0
## $ 5sdo : num 1 0
## $ 5slfw : num 1 0
## $ 5th : num 2 0
## $ 600 : num 1 0
## $ 6000 : num 1 0
## $ 64bit : num 4 0
## $ 65min : num 1 0
## $ 6monthlifespan : num 1 0
## $ 719 : num 1 0
## $ 740 : num 1 0
## $ 777 : num 1 0
## $ 7evenstarz : num 7 0
## $ 7pm : num 1 0
## $ 7wow : num 1 0
## $ 86m : num 1 0
## $ 884 : num 1 0
## $ 899 : num 1 0
## $ 8bitsound : num 1 0
## $ 900 : num 1 0
## $ 917 : num 1 0
## $ 99preorder : num 1 0
## $ a7s : num 1 0
## $ aaaaaapple : num 1 0
## $ aapl : num 2 0
## $ abrsm : num 1 0
## $ absolutely : num 4 0
## $ abt : num 2 0
## $ acampan : num 1 0
## $ acceptable : num 1 0
## $ access : num 4 0
## $ accidentally : num 2 0
## $ acciones : num 3 0
## $ according : num 1 0
## $ accordingtonina : num 1 0
## $ account : num 4 0
## $ accurate : num 1 0
## $ ace : num 1 0
## [list output truncated]
# All term columns are distinct, so unique() keeps every column.
ncol(unique(allTweets))
## [1] 3780
#install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
#Which function can we apply to allTweets to get a vector of the words in our dataset, which we'll pass as the first argument to wordcloud()?
words<- colnames(allTweets)
#Which function should we apply to allTweets to obtain the frequency of each word across all tweets?
freq<- colSums(allTweets)
# scale = c(2, 0.25): font sizes for the most and least frequent words.
wordcloud(words,freq,scale=c(2, 0.25))
# remove "apple" (the most frequent word) from the corpus
wordToRemove = c("apple")
corpusNoApple<-tm_map(corpus,removeWords,wordToRemove)
dtmNoApple <- DocumentTermMatrix(corpusNoApple)
dtmNoApple
## <<DocumentTermMatrix (documents: 2, terms: 3779)>>
## Non-/sparse entries: 3779/3779
## Sparsity : 50%
## Maximal term length: 115
## Weighting : term frequency (tf)
allTweetsNoApple <- as.data.frame(as.matrix(dtmNoApple))
wordsNoApple<- colnames(allTweetsNoApple)
# check if 'apple' is still in the list
'apple' %in% wordsNoApple
## [1] FALSE
# new data frame without 'apple'
freqNoApple<- colSums(allTweetsNoApple)
# random.order=FALSE places the most frequent words in the center.
wordcloud(wordsNoApple,freqNoApple,scale=c(2, 0.25), random.order=FALSE,min.freq=3,max.words=Inf)
# changing min.freq, max.words
wordcloud(wordsNoApple,freqNoApple,scale=c(2, 0.25), random.order=FALSE,min.freq=10,max.words=100)
# changing rot.per (fraction of words rotated 90 degrees)
wordcloud(wordsNoApple,freqNoApple,scale=c(2, 0.25), random.order=FALSE,rot.per=0.9)
# setting colors using brewer.pal; keep only the five darkest of the 9 Blues
wordcloud(wordsNoApple,freqNoApple,scale=c(2, 0.25), random.order=FALSE, rot.per=0.1,random.color=FALSE,colors=brewer.pal(9, "Blues")[c(5, 6, 7, 8, 9)])
# Equivalent: negative indexing drops the four lightest colors instead.
#wordcloud(wordsNoApple,freqNoApple,scale=c(2, 0.25), random.order=FALSE, rot.per=0.1,random.color=FALSE,colors=brewer.pal(9, "Blues")[c(-1, -2, -3, -4)])